#> 4.2 EVALUATING THE STRUCTURAL TOPIC MODEL
#> Estimation with topical prevalence parameter (Year of publication) - as a covariate
#> Allocate plenty of memory for stm runs
# Report current memory use and raise the memory cap before the stm runs.
# memory.size()/memory.limit() only have an effect on Windows builds of
# R < 4.2.0; on other platforms and on newer R they are stubs that return Inf
# with a warning, so they are skipped there.
if (.Platform$OS.type == "windows" && getRversion() < "4.2.0") {
  # print() keeps the console output the bare top-level calls used to produce.
  print(memory.size())
  print(memory.limit(size = 500000))
}
#> 4.2.1 - Estimate choice of number of topics using searchK function
# Seed the RNG ahead of the searchK run.
set.seed(54321)
# Candidate numbers of topics: 3 to 21 in steps of 3.
K <- seq(3, 21, by = 3)
# Run stm's searchK diagnostics over the candidate values, with DATE as the
# topical prevalence covariate (prevalence and data are forwarded to stm).
k.result <- stm::searchK(
  documents = out$documents,
  vocab = out$vocab,
  K = K,
  prevalence = ~DATE,
  data = out$meta
)
#> Plot the diagnostic values of the searchK function (e.g. semantic coherence)
# Write the searchK diagnostic plot to a PNG file.
searchk_png <- here("Out", "Topic_modelling", "SearchK", "Plot1_searchK_results.png")
# png() does not create missing folders, so make sure the target folder exists.
dir.create(dirname(searchk_png), recursive = TRUE, showWarnings = FALSE)
png(searchk_png, width = 800, height = 600)
# Use the plot() generic rather than calling the S3 method by name, and close
# the device in `finally` so a failed plot never leaves the PNG device open as
# the active graphics device.
kplot <- tryCatch(
  plot(k.result),
  finally = dev.off()
)
#> Notes:
# - Held-out likelihood is essentially a measure of complexity - an "old" metric that doesn't provide good "human" results: https://www.youtube.com/watch?v=rfHCronRgQU&t=42s
#> Note, the results of searchK may not be as useful as comparing semantic coherence with exclusivity (see later)
#> FITTING OF MODEL AND EVALUATION OF #K (topics)
#> Generate a range of stm models with different values of K
#> For each model, generate a series of outputs, including top topics, semantic coherence vs. exclusivity, and word clouds - in order to choose "final K"
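#> Chosen 10 to 50 in increments of 10 as we think more than 50 topics would be difficult to interpret given time and resources
#> NOTE(review): the models fitted below use K = 0 and K = 3 to 21, not 10 to 50 - confirm whether this note is stale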
library(Rtsne)
library(rsvd)
library(geometry)
#> K = 0
#> When initialization type is set to "Spectral" the user can specify K = 0 to use the algorithm of Lee and Mimno (2014) to select the number of topics. The core idea of the spectral initialization is to approximately find the vertices of the convex hull of the word co-occurrences. The algorithm of Lee and Mimno (2014) projects the matrix into a low dimensional space using t-distributed stochastic neighbor embedding (Van der Maaten 2014) and then exactly solves for the convex hull. This has the advantage of automatically selecting the number of topics. The added randomness from the projection means that the algorithm is not deterministic like the standard "Spectral" initialization type. Running it with a different seed can result in not only different results but a different number of topics. We emphasize that this procedure has no particular statistical guarantees and should not be seen as estimating the “true” number of topics. However it can be useful to start and has the computational advantage that it only needs to be run once.
# Let stm choose the number of topics itself (K = 0 with spectral
# initialisation). This procedure is randomised, so the seed is passed
# explicitly; otherwise the fit depends on whatever RNG state the earlier
# code happened to leave behind.
stm.0 <- stm::stm(
  documents = out$documents,
  vocab = out$vocab,
  K = 0,
  prevalence = ~DATE,
  max.em.its = 1000,
  data = out$meta,
  init.type = "Spectral",
  seed = 54321
)
#> Run stm with specified number of topics (note: no need to set seed with spectral initialization)
#
# #> K = 2 - DOES NOT WORK WITH ONLY 2 TOPICS
# stm.2 <- stm::stm(documents = out$documents, vocab = out$vocab, K = 2, prevalence =~ DATE, max.em.its = 100, data = out$meta, init.type = "Spectral")
#> K = 3
# Fit one spectral-initialised stm per candidate number of topics.
# The fits differ only in K, so the shared arguments live in a single helper
# instead of being repeated in every call.

# Fit an stm with `k` topics, DATE as the prevalence covariate, spectral
# initialisation and at most 100 EM iterations. Reads `out` from the calling
# script's environment, exactly as the individual calls did.
fit_stm_spectral <- function(k) {
  stm::stm(
    documents = out$documents,
    vocab = out$vocab,
    K = k,
    prevalence = ~DATE,
    max.em.its = 100,
    data = out$meta,
    init.type = "Spectral"
  )
}

#> K = 3
stm.3 <- fit_stm_spectral(3)
#> K = 4
stm.4 <- fit_stm_spectral(4)
#> K = 5
stm.5 <- fit_stm_spectral(5)
#> K = 6
stm.6 <- fit_stm_spectral(6)
#> K = 9
stm.9 <- fit_stm_spectral(9)
#> K = 12
stm.12 <- fit_stm_spectral(12)
#> K = 15
stm.15 <- fit_stm_spectral(15)
#> K = 18
stm.18 <- fit_stm_spectral(18)
#> K = 21
stm.21 <- fit_stm_spectral(21)
# with spline "s()" applied to DATE
# #> K = 2
# stm.2.sp <- stm::stm(documents = out$documents, vocab = out$vocab, K = 2, prevalence =~ s(DATE), max.em.its = 100, data = out$meta, init.type = "Spectral")
# #> K = 3
# stm.3.sp <- stm::stm(documents = out$documents, vocab = out$vocab, K = 3, prevalence =~ s(DATE), max.em.its = 100, data = out$meta, init.type = "Spectral")
# #> K = 4
# stm.4.sp <- stm::stm(documents = out$documents, vocab = out$vocab, K = 4, prevalence =~ s(DATE), max.em.its = 100, data = out$meta, init.type = "Spectral")
# #> K = 5
# stm.5.sp <- stm::stm(documents = out$documents, vocab = out$vocab, K = 5, prevalence =~ s(DATE), max.em.its = 100, data = out$meta, init.type = "Spectral")
# #> K = 6
# stm.6.sp <- stm::stm(documents = out$documents, vocab = out$vocab, K = 6, prevalence =~ s(DATE), max.em.its = 100, data = out$meta, init.type = "Spectral")
# #> K = 9
# stm.9.sp <- stm::stm(documents = out$documents, vocab = out$vocab, K = 9, prevalence =~ s(DATE), max.em.its = 100, data = out$meta, init.type = "Spectral")
# #> K = 12
# stm.12.sp <- stm::stm(documents = out$documents, vocab = out$vocab, K = 12, prevalence =~s(DATE), max.em.its = 100, data = out$meta, init.type = "Spectral")
# #> K = 15
# stm.15.sp <- stm::stm(documents = out$documents, vocab = out$vocab, K = 15, prevalence =~ s(DATE), max.em.its = 100, data = out$meta, init.type = "Spectral")
# #> K = 18
# stm.18.sp <- stm::stm(documents = out$documents, vocab = out$vocab, K = 18, prevalence =~ s(DATE), max.em.its = 100, data = out$meta, init.type = "Spectral")
# #> K = 21
# stm.21.sp <- stm::stm(documents = out$documents, vocab = out$vocab, K = 21, prevalence =~ s(DATE), max.em.its = 100, data = out$meta, init.type = "Spectral")